#import the important packages
import pandas as pd #library used for data manipulation and analysis
import numpy as np # library used for working with arrays.
import matplotlib.pyplot as plt # library for plots and visualisations
import seaborn as sns # library for visualisations
import plotly.express as px # library for visualisations
%matplotlib inline
import scipy.stats as stats # this library contains a large number of probability distributions as well as a growing library of statistical functions.
from scipy.stats import *
import warnings # ignore warnings
warnings.filterwarnings("ignore")
# install pandas sql
!pip install -U pandasql
import pandasql as psql # python library that allows manipulation of a Pandas Dataframe using SQL
from pandasql import sqldf
Requirement already satisfied: pandasql in c:\users\abhinav roy\anaconda3\lib\site-packages (0.7.3) Requirement already satisfied: sqlalchemy in c:\users\abhinav roy\anaconda3\lib\site-packages (from pandasql) (1.4.32) Requirement already satisfied: pandas in c:\users\abhinav roy\anaconda3\lib\site-packages (from pandasql) (1.4.2) Requirement already satisfied: numpy in c:\users\abhinav roy\anaconda3\lib\site-packages (from pandasql) (1.21.5) Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\abhinav roy\anaconda3\lib\site-packages (from pandas->pandasql) (2.8.2) Requirement already satisfied: pytz>=2020.1 in c:\users\abhinav roy\anaconda3\lib\site-packages (from pandas->pandasql) (2021.3) Requirement already satisfied: six>=1.5 in c:\users\abhinav roy\anaconda3\lib\site-packages (from python-dateutil>=2.8.1->pandas->pandasql) (1.16.0) Requirement already satisfied: greenlet!=0.4.17 in c:\users\abhinav roy\anaconda3\lib\site-packages (from sqlalchemy->pandasql) (1.1.1)
#1A
P_A = (400/2000) # P(Planned to purchase| Actually placed an order)
print('Joint probability of the people who planned to purchase and actually placed an order is %1.4f' % P_A)
Joint probability of the people who planned to purchase and actually placed an order is 0.2000
#2B
P_B = (400 / 2000 )/ (500 / 2000) # P(Actually placed an order | Planned to purchase and actually placed an order)
print('Joint probability of the people who planned to purchase and actually placed an order, given that people planned to purchase is %1.4f' % P_B)
Joint probability of the people who planned to purchase and actually placed an order, given that people planned to purchase is 0.8000
p2 = 0.05 # failure rate
n2 = 10 # sample size
k2 = np.arange(11) # array of 10 items
binom = binom.pmf(k2, n2, p2)
binom
array([5.98736939e-01, 3.15124705e-01, 7.46347985e-02, 1.04750594e-02,
9.64808106e-04, 6.09352488e-05, 2.67259863e-06, 8.03789062e-08,
1.58642578e-09, 1.85546875e-11, 9.76562500e-14])
#2A
print(' Probability that none of the items are defective is %1.4f' % binom[0])
Probability that none of the items are defective is 0.5987
#2B
print(' Probability that exactly one of the items is defective is %1.4f' % binom[1])
Probability that exactly one of the items is defective is 0.3151
import scipy.stats as stats # this library contains a large number of probability distributions as well as a growing library of statistical functions.
from scipy.stats import *
cubinom = binom.cdf(k2, n2, p2)
cubinom
array([0.59873694, 0.91386164, 0.98849644, 0.9989715 , 0.99993631,
0.99999725, 0.99999992, 1. , 1. , 1. ,
1. ])
#2C
print(' Probability that two or fewer of the items are defective is %1.4f' % cubinom[2])
Probability that two or fewer of the items are defective is 0.9885
#2D
print(' Probability that three or more of the items are defective is %1.4f' % (1-cubinom[2]))
Probability that three or more of the items are defective is 0.0115
rate = 3 # number of cars per week
n3 = np.arange(0,31)
cumpoisson = poisson.cdf(n3,rate)
cumpoisson
array([0.04978707, 0.19914827, 0.42319008, 0.64723189, 0.81526324,
0.91608206, 0.96649146, 0.9880955 , 0.99619701, 0.99889751,
0.99970766, 0.99992861, 0.99998385, 0.9999966 , 0.99999933,
0.99999988, 0.99999998, 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. , 1. , 1. , 1. , 1. ,
1. ])
#3A
print('Probability that in a given week he will sell some cars is %1.4f' % (1-cumpoisson[0]))
Probability that in a given week he will sell some cars is 0.9502
#3B
print('Probability that in a given week he will sell some cars is %1.4f' % (cumpoisson[4]-cumpoisson[2]))
Probability that in a given week he will sell some cars is 0.3921
#3C
plt.plot(n3, cumpoisson, 'o-')
plt.title('Poisson Distribution Function')
plt.xlabel('Number of cars sold per week')
plt.ylabel('Cumulative probability of cars sold per-week')
plt.show()
p4 = 0.868
n4 = 3
k4 = np.arange(4)
binom4 = binom.pmf(k4, n4, p4)
binom4
array([0.00229997, 0.0453721 , 0.2983559 , 0.65397203])
#4A
print('Probability that all three orders will be recognised correctly is %1.4f' % binom4[3])
Probability that all three orders will be recognised correctly is 0.6540
#4B
print('Probability that none of the three orders will be recognised correctly is %1.4f' % binom4[0])
Probability that none of the three orders will be recognised correctly is 0.0023
cumbinom4 = binom.cdf(k4, n4, p4)
cumbinom4
array([0.00229997, 0.04767206, 0.34602797, 1. ])
#4C
cumbinom4c = 1 - (cumbinom4[0] + cumbinom4[1])
print('Probability that at least two of the three orders will be recognised correctly is %1.4f' % cumbinom4c)
Probability that at least two of the three orders will be recognised correctly is 0.9500
Taking the example of Covid-19:
1) Binomial Distribution can be used to estimate the number of Covid positive cases in an area
2) Poisson Distribution can be used to estimate the number of Covid positive cases in a city per day
3) Normal Distribution can be used to analyse the central tendancy and dispersion of the number of Covid positive cases in a city by taking the log of the population sample
4) Uniform Distribution can be used to find the outcome of a Covid RT-PCR test, i.e, positive or negative, both of which are equally likely
5) Central Limit Theorem can be used to study and draw conclusions about the overall population parameters of different Coronavirus variants.
6) Hypothesis Testing can be used to create a faster alternative Rapid Covid test with 95% cofidence interval
• DOMAIN: Sports • CONTEXT: Company X manages the men's top professional basketball division of the American league system. The dataset contains information on all the teams that have participated in all the past tournaments. It has data about how many baskets each team scored, conceded, how many times they came within the first 2 positions, how many tournaments they have qualified, their best position in the past, etc. • DATA DESCRIPTION: Basketball.csv - The data set contains information on all the teams so far participated in all the past tournaments. • DATA DICTIONARY:
• PROJECT OBJECTIVE: Company’s management wants to invest on proposals on managing some of the best teams in the league. The analytics department has been assigned with a task of creating a report on the performance shown by the teams. Some of the older teams are already in contract with competitors. Hence Company X wants to understand which teams they can approach which will be a deal win for them. • STEPS AND TASK [30 Marks]:
Read the data set, clean the data and prepare final dataset to be used for analysis. [10 Marks]
Perform detailed statistical analysis and EDA using univariate, bi-variate and multivariate EDA techniques to get data driven insights on recommending which teams they can approach which will be a deal win for them. Also as a data and statistics expert you have to develop a detailed performance report using this data. [10 Marks] Hint: Use statistical techniques and visualisation techniques to come up with useful metrics and reporting. Find out the best performing team, oldest team, team with highest goals, team with lowest performance etc. and many more. These are just random examples. Please use your best analytical approach to build this report. You can mix match columns to create new ones which can be used for better analysis. Create your own features if required. Be highly experimental and analytical here to find hidden patterns. Use graphical interactive libraries to enable you to publish interactive plots in python.
Please include any improvements or suggestions to the association management on quality, quantity, variety, velocity, veracity etc. on the data points collected by the association to perform a better data analysis in future. At-least 1 suggestion for each point. [10 Marks]
#load the dataset
df = pd.read_csv("Basketball.csv")
df.head()
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | Runner-up | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Team 1 | 86 | 4385 | 2762 | 1647 | 552 | 563 | 5947 | 3140 | 33 | 23 | 1929 | 1 |
| 1 | Team 2 | 86 | 4262 | 2762 | 1581 | 573 | 608 | 5900 | 3114 | 25 | 25 | 1929 | 1 |
| 2 | Team 3 | 80 | 3442 | 2614 | 1241 | 598 | 775 | 4534 | 3309 | 10 | 8 | 1929 | 1 |
| 3 | Team 4 | 82 | 3386 | 2664 | 1187 | 616 | 861 | 4398 | 3469 | 6 | 6 | 1931to32 | 1 |
| 4 | Team 5 | 86 | 3368 | 2762 | 1209 | 633 | 920 | 4631 | 3700 | 8 | 7 | 1929 | 1 |
# dataframe shape
df.shape
(61, 13)
# dataframe information
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 61 entries, 0 to 60 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Team 61 non-null object 1 Tournament 61 non-null int64 2 Score 61 non-null object 3 PlayedGames 61 non-null object 4 WonGames 61 non-null object 5 DrawnGames 61 non-null object 6 LostGames 61 non-null object 7 BasketScored 61 non-null object 8 BasketGiven 61 non-null object 9 TournamentChampion 61 non-null object 10 Runner-up 61 non-null object 11 TeamLaunch 61 non-null object 12 HighestPositionHeld 61 non-null int64 dtypes: int64(2), object(11) memory usage: 6.3+ KB
# dataframe description
df.describe(include = 'all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Team | 61 | 61 | Team 1 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Tournament | 61.0 | NaN | NaN | NaN | 24.0 | 26.827225 | 1.0 | 4.0 | 12.0 | 38.0 | 86.0 |
| Score | 61 | 61 | 4385 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| PlayedGames | 61 | 53 | 2762 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| WonGames | 61 | 59 | 7 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| DrawnGames | 61 | 57 | 14 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| LostGames | 61 | 56 | 37 | 3 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| BasketScored | 61 | 60 | 70 | 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| BasketGiven | 61 | 61 | 3140 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| TournamentChampion | 61 | 8 | - | 52 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Runner-up | 61 | 10 | - | 48 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| TeamLaunch | 61 | 47 | 1929 | 10 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| HighestPositionHeld | 61.0 | NaN | NaN | NaN | 7.081967 | 5.276663 | 1.0 | 3.0 | 6.0 | 10.0 | 20.0 |
#Check the null values in all fields
df.isna().sum()
Team 0 Tournament 0 Score 0 PlayedGames 0 WonGames 0 DrawnGames 0 LostGames 0 BasketScored 0 BasketGiven 0 TournamentChampion 0 Runner-up 0 TeamLaunch 0 HighestPositionHeld 0 dtype: int64
#Replace NAN values with 0 since dataset is not large
df.replace(np.nan, 0, inplace = True)
df.tail()
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | Runner-up | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56 | Team 57 | 1 | 34 | 38 | 8 | 10 | 20 | 38 | 66 | - | - | 2009-10 | 20 |
| 57 | Team 58 | 1 | 22 | 30 | 7 | 8 | 15 | 37 | 57 | - | - | 1956-57 | 16 |
| 58 | Team 59 | 1 | 19 | 30 | 7 | 5 | 18 | 51 | 85 | - | - | 1951~52 | 16 |
| 59 | Team 60 | 1 | 14 | 30 | 5 | 4 | 21 | 34 | 65 | - | - | 1955-56 | 15 |
| 60 | Team 61 | 1 | - | - | - | - | - | - | - | - | - | 2017~18 | 9 |
#Substring to pick the first year as Team Launch year only
df['TeamLaunch'] = df['TeamLaunch'].apply(lambda x: x[0:4])
df.tail()
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | Runner-up | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 56 | Team 57 | 1 | 34 | 38 | 8 | 10 | 20 | 38 | 66 | - | - | 2009 | 20 |
| 57 | Team 58 | 1 | 22 | 30 | 7 | 8 | 15 | 37 | 57 | - | - | 1956 | 16 |
| 58 | Team 59 | 1 | 19 | 30 | 7 | 5 | 18 | 51 | 85 | - | - | 1951 | 16 |
| 59 | Team 60 | 1 | 14 | 30 | 5 | 4 | 21 | 34 | 65 | - | - | 1955 | 15 |
| 60 | Team 61 | 1 | - | - | - | - | - | - | - | - | - | 2017 | 9 |
# Convert all columns except Team to float datatype
cols = df.columns
df[cols[1:]] = df[cols[1:]].apply(pd.to_numeric, errors = 'coerce', axis = 1)
# dataframe information
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 61 entries, 0 to 60 Data columns (total 13 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Team 61 non-null object 1 Tournament 61 non-null float64 2 Score 60 non-null float64 3 PlayedGames 60 non-null float64 4 WonGames 60 non-null float64 5 DrawnGames 60 non-null float64 6 LostGames 60 non-null float64 7 BasketScored 60 non-null float64 8 BasketGiven 60 non-null float64 9 TournamentChampion 9 non-null float64 10 Runner-up 13 non-null float64 11 TeamLaunch 61 non-null float64 12 HighestPositionHeld 61 non-null float64 dtypes: float64(12), object(1) memory usage: 6.3+ KB
# dataframe description
df.describe(include = 'all').T
| count | unique | top | freq | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Team | 61 | 61 | Team 1 | 1 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| Tournament | 61.0 | NaN | NaN | NaN | 24.0 | 26.827225 | 1.0 | 4.0 | 12.0 | 38.0 | 86.0 |
| Score | 60.0 | NaN | NaN | NaN | 916.45 | 1138.342899 | 14.0 | 104.25 | 395.5 | 1360.5 | 4385.0 |
| PlayedGames | 60.0 | NaN | NaN | NaN | 810.1 | 877.465393 | 30.0 | 115.5 | 424.5 | 1345.5 | 2762.0 |
| WonGames | 60.0 | NaN | NaN | NaN | 309.033333 | 408.481395 | 5.0 | 34.75 | 124.0 | 432.75 | 1647.0 |
| DrawnGames | 60.0 | NaN | NaN | NaN | 192.083333 | 201.985508 | 4.0 | 26.25 | 98.5 | 331.5 | 633.0 |
| LostGames | 60.0 | NaN | NaN | NaN | 308.816667 | 294.508639 | 15.0 | 62.75 | 197.5 | 563.5 | 1070.0 |
| BasketScored | 60.0 | NaN | NaN | NaN | 1159.35 | 1512.063948 | 34.0 | 154.5 | 444.0 | 1669.75 | 5947.0 |
| BasketGiven | 60.0 | NaN | NaN | NaN | 1159.233333 | 1163.946914 | 55.0 | 236.0 | 632.5 | 2001.25 | 3889.0 |
| TournamentChampion | 9.0 | NaN | NaN | NaN | 9.666667 | 11.61895 | 1.0 | 1.0 | 6.0 | 10.0 | 33.0 |
| Runner-up | 13.0 | NaN | NaN | NaN | 6.615385 | 8.109033 | 1.0 | 1.0 | 4.0 | 7.0 | 25.0 |
| TeamLaunch | 61.0 | NaN | NaN | NaN | 1958.918033 | 27.484114 | 1929.0 | 1935.0 | 1951.0 | 1978.0 | 2017.0 |
| HighestPositionHeld | 61.0 | NaN | NaN | NaN | 7.081967 | 5.276663 | 1.0 | 3.0 | 6.0 | 10.0 | 20.0 |
#Plotting boxplot for Individual fields for dataframe df
plt.figure(figsize=(15,7))
index = 1
for col in df.columns[1:]:
plt.subplot(4,3,index)
sns.boxplot(df[col], color = 'violet')
index += 1;
plt.tight_layout()
#plotting heatmap for dataframe df
plt.figure(figsize = (15,7))
sns.heatmap(df.corr(), annot = True, cmap = 'rainbow');
# create new dataframe df2 to add derived fields
df2 = df.copy(deep = True)
df2.head()
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | Runner-up | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Team 1 | 86.0 | 4385.0 | 2762.0 | 1647.0 | 552.0 | 563.0 | 5947.0 | 3140.0 | 33.0 | 23.0 | 1929.0 | 1.0 |
| 1 | Team 2 | 86.0 | 4262.0 | 2762.0 | 1581.0 | 573.0 | 608.0 | 5900.0 | 3114.0 | 25.0 | 25.0 | 1929.0 | 1.0 |
| 2 | Team 3 | 80.0 | 3442.0 | 2614.0 | 1241.0 | 598.0 | 775.0 | 4534.0 | 3309.0 | 10.0 | 8.0 | 1929.0 | 1.0 |
| 3 | Team 4 | 82.0 | 3386.0 | 2664.0 | 1187.0 | 616.0 | 861.0 | 4398.0 | 3469.0 | 6.0 | 6.0 | 1931.0 | 1.0 |
| 4 | Team 5 | 86.0 | 3368.0 | 2762.0 | 1209.0 | 633.0 | 920.0 | 4631.0 | 3700.0 | 8.0 | 7.0 | 1929.0 | 1.0 |
#Add new field to combine the number of times a Team finished in the Top 2
top_2_finishes = df2['TournamentChampion'] + df2['Runner-up']
df2.insert(11, 'TopTwoFinishes', top_2_finishes)
df2.head()
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | TournamentChampion | Runner-up | TopTwoFinishes | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Team 1 | 86.0 | 4385.0 | 2762.0 | 1647.0 | 552.0 | 563.0 | 5947.0 | 3140.0 | 33.0 | 23.0 | 56.0 | 1929.0 | 1.0 |
| 1 | Team 2 | 86.0 | 4262.0 | 2762.0 | 1581.0 | 573.0 | 608.0 | 5900.0 | 3114.0 | 25.0 | 25.0 | 50.0 | 1929.0 | 1.0 |
| 2 | Team 3 | 80.0 | 3442.0 | 2614.0 | 1241.0 | 598.0 | 775.0 | 4534.0 | 3309.0 | 10.0 | 8.0 | 18.0 | 1929.0 | 1.0 |
| 3 | Team 4 | 82.0 | 3386.0 | 2664.0 | 1187.0 | 616.0 | 861.0 | 4398.0 | 3469.0 | 6.0 | 6.0 | 12.0 | 1931.0 | 1.0 |
| 4 | Team 5 | 86.0 | 3368.0 | 2762.0 | 1209.0 | 633.0 | 920.0 | 4631.0 | 3700.0 | 8.0 | 7.0 | 15.0 | 1929.0 | 1.0 |
#Add new field to find the Basket difference for a Team
basket_diff = df2['BasketScored'] - df2['BasketGiven']
df2.insert(9, 'BasketDiff', basket_diff)
df2.head()
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | BasketScored | BasketGiven | BasketDiff | TournamentChampion | Runner-up | TopTwoFinishes | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Team 1 | 86.0 | 4385.0 | 2762.0 | 1647.0 | 552.0 | 563.0 | 5947.0 | 3140.0 | 2807.0 | 33.0 | 23.0 | 56.0 | 1929.0 | 1.0 |
| 1 | Team 2 | 86.0 | 4262.0 | 2762.0 | 1581.0 | 573.0 | 608.0 | 5900.0 | 3114.0 | 2786.0 | 25.0 | 25.0 | 50.0 | 1929.0 | 1.0 |
| 2 | Team 3 | 80.0 | 3442.0 | 2614.0 | 1241.0 | 598.0 | 775.0 | 4534.0 | 3309.0 | 1225.0 | 10.0 | 8.0 | 18.0 | 1929.0 | 1.0 |
| 3 | Team 4 | 82.0 | 3386.0 | 2664.0 | 1187.0 | 616.0 | 861.0 | 4398.0 | 3469.0 | 929.0 | 6.0 | 6.0 | 12.0 | 1931.0 | 1.0 |
| 4 | Team 5 | 86.0 | 3368.0 | 2762.0 | 1209.0 | 633.0 | 920.0 | 4631.0 | 3700.0 | 931.0 | 8.0 | 7.0 | 15.0 | 1929.0 | 1.0 |
#Add new field to find the Net Game Points for individual Team as a better way to analyse a Team's performance
# WonGames = 3 points, LostGames = 0 points, DrawnGames = 1 point
net_game_points = (df2['WonGames'] * 3) + (df2['LostGames'] * 0) + (df2['DrawnGames'] * 1)
df2.insert(7, 'NetGamePoints', net_game_points)
df2.head()
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | NetGamePoints | BasketScored | BasketGiven | BasketDiff | TournamentChampion | Runner-up | TopTwoFinishes | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Team 1 | 86.0 | 4385.0 | 2762.0 | 1647.0 | 552.0 | 563.0 | 5493.0 | 5947.0 | 3140.0 | 2807.0 | 33.0 | 23.0 | 56.0 | 1929.0 | 1.0 |
| 1 | Team 2 | 86.0 | 4262.0 | 2762.0 | 1581.0 | 573.0 | 608.0 | 5316.0 | 5900.0 | 3114.0 | 2786.0 | 25.0 | 25.0 | 50.0 | 1929.0 | 1.0 |
| 2 | Team 3 | 80.0 | 3442.0 | 2614.0 | 1241.0 | 598.0 | 775.0 | 4321.0 | 4534.0 | 3309.0 | 1225.0 | 10.0 | 8.0 | 18.0 | 1929.0 | 1.0 |
| 3 | Team 4 | 82.0 | 3386.0 | 2664.0 | 1187.0 | 616.0 | 861.0 | 4177.0 | 4398.0 | 3469.0 | 929.0 | 6.0 | 6.0 | 12.0 | 1931.0 | 1.0 |
| 4 | Team 5 | 86.0 | 3368.0 | 2762.0 | 1209.0 | 633.0 | 920.0 | 4260.0 | 4631.0 | 3700.0 | 931.0 | 8.0 | 7.0 | 15.0 | 1929.0 | 1.0 |
# Plotting the heat map for dataframe df2
plt.figure(figsize = (15,7))
sns.heatmap(df2.corr(), annot = True, cmap = 'rainbow');
# Plotting the pairplot for dataframe df2
sns.pairplot(data = df2[['Tournament','Score','NetGamePoints','BasketDiff','TopTwoFinishes']], corner=True);
# Plotting the line plot for dataframe df2
plt.figure(figsize=(20,10))
sns.lineplot('Team', 'Tournament', data = df2, color='Violet')
sns.lineplot('Team', 'PlayedGames', data = df2, color='Black')
sns.lineplot('Team', 'Score', data = df2, color='Orange')
sns.lineplot('Team', 'NetGamePoints', data = df2, color='Green')
sns.lineplot('Team', 'BasketDiff', data = df2, color='Red')
sns.lineplot('Team', 'TopTwoFinishes', data = df2, color='Blue')
plt.grid()
plt.xticks(rotation=45)
plt.ylabel('Count')
plt.legend(labels=['Tournament','Played Games', 'Score', 'Net Game Points', 'Basket Difference', 'Top Two Finishes'])
plt.title('Teams vs Performance metrics in Tournaments');
# create new dataframe df3 to analyse the top Teams
df3 = df2[df2['TopTwoFinishes'] > 0].copy(deep = True)
df3
| Team | Tournament | Score | PlayedGames | WonGames | DrawnGames | LostGames | NetGamePoints | BasketScored | BasketGiven | BasketDiff | TournamentChampion | Runner-up | TopTwoFinishes | TeamLaunch | HighestPositionHeld | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Team 1 | 86.0 | 4385.0 | 2762.0 | 1647.0 | 552.0 | 563.0 | 5493.0 | 5947.0 | 3140.0 | 2807.0 | 33.0 | 23.0 | 56.0 | 1929.0 | 1.0 |
| 1 | Team 2 | 86.0 | 4262.0 | 2762.0 | 1581.0 | 573.0 | 608.0 | 5316.0 | 5900.0 | 3114.0 | 2786.0 | 25.0 | 25.0 | 50.0 | 1929.0 | 1.0 |
| 2 | Team 3 | 80.0 | 3442.0 | 2614.0 | 1241.0 | 598.0 | 775.0 | 4321.0 | 4534.0 | 3309.0 | 1225.0 | 10.0 | 8.0 | 18.0 | 1929.0 | 1.0 |
| 3 | Team 4 | 82.0 | 3386.0 | 2664.0 | 1187.0 | 616.0 | 861.0 | 4177.0 | 4398.0 | 3469.0 | 929.0 | 6.0 | 6.0 | 12.0 | 1931.0 | 1.0 |
| 4 | Team 5 | 86.0 | 3368.0 | 2762.0 | 1209.0 | 633.0 | 920.0 | 4260.0 | 4631.0 | 3700.0 | 931.0 | 8.0 | 7.0 | 15.0 | 1929.0 | 1.0 |
| 5 | Team 6 | 73.0 | 2819.0 | 2408.0 | 990.0 | 531.0 | 887.0 | 3501.0 | 3680.0 | 3373.0 | 307.0 | 1.0 | 4.0 | 5.0 | 1934.0 | 1.0 |
| 7 | Team 8 | 70.0 | 2573.0 | 2302.0 | 864.0 | 577.0 | 861.0 | 3169.0 | 3228.0 | 3230.0 | -2.0 | 2.0 | 3.0 | 5.0 | 1929.0 | 1.0 |
| 10 | Team 11 | 45.0 | 1814.0 | 1530.0 | 563.0 | 392.0 | 575.0 | 2081.0 | 2052.0 | 2188.0 | -136.0 | 1.0 | 5.0 | 6.0 | 1941.0 | 1.0 |
# Plotting the pie plot for dataframe df3
plt.figure(figsize=(15,7))
plt.pie(df3['TopTwoFinishes'], labels= df3['Team'])
plt.title('Teams with most the most Top 2 finishes in Tournaments')
plt.show()
#Plotting lineplot for Individual fields for dataframe df3
plt.figure(figsize=(20,10))
sns.lineplot(y = 'TopTwoFinishes', x = 'Tournament', data = df3, color='Purple')
sns.lineplot(y = 'TopTwoFinishes', x = 'PlayedGames', data = df3, color='Black')
sns.lineplot(y = 'TopTwoFinishes', x = 'Score', data = df3, color='Orange')
sns.lineplot(y = 'TopTwoFinishes', x = 'NetGamePoints', data = df3, color='Green')
sns.lineplot(y = 'TopTwoFinishes', x = 'BasketDiff', data = df3, color='Red')
plt.grid()
plt.xlabel('x-axis')
plt.legend(labels=['Tournament','Played Games', 'Score', 'Net Game Points', 'Basket Difference'])
plt.title('Top Two Finishes vs Performance metrics in Tournaments');
#Plotting lineplot subplot for Individual fields for dataframe df3
plt.figure(figsize=(15,7))
index = 1
for col in df.columns[1:]:
plt.subplot(4,3,index)
sns.lineplot(y = 'TopTwoFinishes', x = col, data = df3)
index += 1
plt.grid();
plt.tight_layout()
#Plotting histplot subplot for Individual fields for dataframe df3
plt.figure(figsize=(20,10))
index = 1
for col in df.columns[1:]:
plt.subplot(4,3,index)
sns.histplot(x = col, data = df3, kde = True)
index += 1
plt.grid();
plt.tight_layout()
• DOMAIN: Startup ecosystem • CONTEXT: C • DATA DESCRIPTION: CompanyX_EU.csv - Each row in the dataset is a Start-up company and the columns describe the company. • DATA DICTIONARY:
• PROJECT OBJECTIVE: Analyse the data of the various companies from the given dataset and perform the tasks that are specified in the below steps. Draw insights from the various attributes that are present in the dataset, plot distributions, state hypotheses and draw conclusions from the dataset. • STEPS AND TASK [15 Marks]:
#load the dataset df_1
df_1 = pd.read_csv("CompanyX_EU.csv")
df_1.head()
| Startup | Product | Funding | Event | Result | OperatingState | |
|---|---|---|---|---|---|---|
| 0 | 2600Hz | 2600hz.com | NaN | Disrupt SF 2013 | Contestant | Operating |
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed |
| 2 | 3DPrinterOS | 3dprinteros.com | NaN | Disrupt SF 2016 | Contestant | Operating |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating |
| 4 | 42 Technologies | 42technologies.com | NaN | Disrupt NYC 2013 | Contestant | Operating |
# dataframe df_1 information
df_1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 662 entries, 0 to 661 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Startup 662 non-null object 1 Product 656 non-null object 2 Funding 448 non-null object 3 Event 662 non-null object 4 Result 662 non-null object 5 OperatingState 662 non-null object dtypes: object(6) memory usage: 31.2+ KB
# dataframe df_1 description
df_1.describe(include = 'all').T
| count | unique | top | freq | |
|---|---|---|---|---|
| Startup | 662 | 662 | 2600Hz | 1 |
| Product | 656 | 656 | 2600hz.com | 1 |
| Funding | 448 | 240 | $1M | 17 |
| Event | 662 | 26 | TC50 2008 | 52 |
| Result | 662 | 5 | Contestant | 488 |
| OperatingState | 662 | 4 | Operating | 465 |
# Check the datatypes of each attribute in dataframe df_1
df_1.dtypes
Startup object Product object Funding object Event object Result object OperatingState object dtype: object
# Check for null values in the attributes in dataframe df_1
df_1.isna().sum()
Startup 0 Product 6 Funding 214 Event 0 Result 0 OperatingState 0 dtype: int64
# Drop the null values in dataframe df_1
df_1.dropna(inplace = True)
# Check the null values in all fields in dataframe df_1
df_1.isna().sum()
Startup 0 Product 0 Funding 0 Event 0 Result 0 OperatingState 0 dtype: int64
df_1.head()
| Startup | Product | Funding | Event | Result | OperatingState | |
|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating |
| 5 | 5to1 | 5to1.com | $19.3M | TC50 2009 | Contestant | Acquired |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating |
| 10 | AdhereTech | adheretech.com | $1.8M | Hardware Battlefield 2014 | Contestant | Operating |
# create a new dataframe df_2 for further analysis
df_2 = df_1.copy(deep = True)
df_2.head()
| Startup | Product | Funding | Event | Result | OperatingState | |
|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating |
| 5 | 5to1 | 5to1.com | $19.3M | TC50 2009 | Contestant | Acquired |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating |
| 10 | AdhereTech | adheretech.com | $1.8M | Hardware Battlefield 2014 | Contestant | Operating |
# Convert the Funding features to a numerical value
df_2.loc[:,'Funds_in_million'] = df_2['Funding'].apply(lambda x: float(x[1:-1])/1000 if x[-1] == 'K' else (float(x[1:-1])*1000 if x[-1] == 'B' else float(x[1:-1])))
# dataframe df_2 information
df_2.head()
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | |
|---|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed | 0.63 |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating | 1.00 |
| 5 | 5to1 | 5to1.com | $19.3M | TC50 2009 | Contestant | Acquired | 19.30 |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating | 29.00 |
| 10 | AdhereTech | adheretech.com | $1.8M | Hardware Battlefield 2014 | Contestant | Operating | 1.80 |
# Plot box plot for Funding for dataframe df_2
fig = px.box(df_2, x = "Funding")
fig.show()
# Plot box plot for funds in million for dataframe df_2
#plt.boxplot(df_2.Funds_in_million)
fig = px.box(df_2, x="Funds_in_million")
fig.show()
# Plot box plot for funds in million for dataframe df_2 using matplotlib
plot = plt.boxplot(df_2.Funds_in_million)
plt.show();
# calculate the lower fence value for the boxplot
lower_fence = plot['caps'][0].get_data()[1][1]
lower_fence
0.005
# calculate the upper fence value for the boxplot
upper_fence = plot['caps'][1].get_data()[1][1]
upper_fence
22.0
# Startups that have received more funding than boxplot plot outlier
df_2[df_2['Funds_in_million'] > upper_fence].head()
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | |
|---|---|---|---|---|---|---|---|
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating | 29.0 |
| 31 | Anyclip | anyclip.com | $24M | TC50 2009 | Finalist | Operating | 24.0 |
| 40 | Artsy | artsy.net | $50.9M | Disrupt NYC 2010 | Contestant | Operating | 50.9 |
| 49 | Badgeville | badgeville.com | $40M | Disrupt SF 2010 | Finalist | Acquired | 40.0 |
| 56 | Betterment | betterment.com | $205M | Disrupt NYC 2010 | Finalist | Operating | 205.0 |
# Check the number of outliers greater than the upper fence
up_fence_outliers = df_2['Startup'][df_2['Funds_in_million'] > upper_fence].count()
print("Number of outliers greater than the upper fence is", up_fence_outliers)
Number of outliers greater than the upper fence is 60
# Check frequency of the OperatingState features classes
df_2.OperatingState.value_counts()
Operating 319 Acquired 66 Closed 57 Ipo 4 Name: OperatingState, dtype: int64
# Plot frequency of the OperatingState features classes
cplot = px.bar(df_2, x = "OperatingState" , color = 'OperatingState')
cplot.show()
# Count of startups that are Operating
df_2['Startup'][df_2['OperatingState'] == 'Operating'].count()
319
# Count of startups that have Closed
df_2['Startup'][df_2['OperatingState'] == 'Closed'].count()
57
# Plot a sublplot to compare the Funds raised by companies that are still operating vs companies that closed down
fig, ax = plt.subplots(1,2)
fig.set_figwidth(15)
fig.set_figheight(7)
sns.distplot(x = df_2['Funds_in_million'][df_2['OperatingState'] == 'Operating'], ax = ax[0])
ax[0].set_title("Funding for Operating Startups")
sns.distplot(x = df_2['Funds_in_million'][df_2['OperatingState'] == 'Closed'], ax = ax[1])
ax[1].set_title("Funding for Closed Startups")
plt.show();
H0 = There is no significant difference between Funds raised by companies that are still operating vs companies that closed down
Ha = There is significant difference between Funds raised by companies that are still operating vs companies that closed down
# import z test to find z_stat, p_val
from statsmodels.stats.weightstats import ztest
alpha = 0.05 # considering 95% Confidence Interval
operating = df_2['Funds_in_million'][df_2['OperatingState'] == 'Operating']
closed = df_2['Funds_in_million'][df_2['OperatingState'] == 'Closed']
z_stat, p_val = ztest(operating, closed)
z_stat, p_val
(1.1382924515740138, 0.25499838490589677)
# check if calculated p_val is greater/ less than alpha
if p_val > alpha:
print("Since p_val of {} is greater than alpha value of {}, we Fail to reject H0".format(round(p_val,2), alpha))
else:
print("Since p_val of {} is less than alpha value of {}, we Reject H0".format(p_val, alpha))
Since p_val of 0.25 is greater than alpha value of 0.05, we Fail to reject H0
# find the z critical at alpha = 0.05
z_crit = norm.isf(0.05/2)
z_crit
1.9599639845400545
# check if calculated z_stat is greater/ less than z_crit
if z_crit > z_stat:
print("Since z_crit of {} is greater than z_stat of {}, we Fail to reject H0".format(round(z_crit,2), round(z_stat,2)))
else:
print("Since z_crit of {} is less than z_stat of {}, we Reject H0".format(round(z_crit,2), round(z_stat,2)))
Since z_crit of 1.96 is greater than z_stat of 1.14, we Fail to reject H0
# create a new dataframe df_3
df_3 = df_2.copy(deep = True)
df_3.head()
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | |
|---|---|---|---|---|---|---|---|
| 1 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed | 0.63 |
| 3 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating | 1.00 |
| 5 | 5to1 | 5to1.com | $19.3M | TC50 2009 | Contestant | Acquired | 19.30 |
| 6 | 8 Securities | 8securities.com | $29M | Disrupt Beijing 2011 | Finalist | Operating | 29.00 |
| 10 | AdhereTech | adheretech.com | $1.8M | Hardware Battlefield 2014 | Contestant | Operating | 1.80 |
# Check frequency distribution of Result variables
df_3['Result'].value_counts()
Contestant 312 Finalist 65 Audience choice 25 Winner 25 Runner up 19 Name: Result, dtype: int64
# Plot frequency of the Result features classes
cplot1 = px.bar(df_3, x = "Result" , color = 'Result')
cplot1.show()
# Sort dataframe df_3 base on the Result of each Startup
df_3['Result'].value_counts()
Contestant 312 Finalist 65 Audience choice 25 Winner 25 Runner up 19 Name: Result, dtype: int64
# Check the total contestants within the Startups
total_contestants = df_3['Result'].value_counts()[0]
total_contestants
312
df_4 = df_3[df_3['OperatingState'] == 'Operating'].groupby('Result').count()
df_4
| Startup | Product | Funding | Event | OperatingState | Funds_in_million | |
|---|---|---|---|---|---|---|
| Result | ||||||
| Audience choice | 23 | 23 | 23 | 23 | 23 | 23 |
| Contestant | 214 | 214 | 214 | 214 | 214 | 214 |
| Finalist | 46 | 46 | 46 | 46 | 46 | 46 |
| Runner up | 18 | 18 | 18 | 18 | 18 | 18 |
| Winner | 18 | 18 | 18 | 18 | 18 | 18 |
# Calculate the number of Winners and Contestants Startups still operating
winners_operating = df_4.loc['Winner','Startup']
contestants_operating = df_4.loc['Contestant','Startup']
winners_operating, contestants_operating
(18, 214)
# Calculate the percentage of Winners and Contestants Startups still operating
per_win_op = round((winners_operating/total_contestants) * 100, 2)
per_con_op = round((contestants_operating/total_contestants) * 100, 2)
print("The percentage of winners that are still operating is {}% and percentage of contestants that are still operating is {}%".format(per_win_op, per_con_op))
The percentage of winners that are still operating is 5.77% and percentage of contestants that are still operating is 68.59%
H0 = The proportion of companies that are operating between winners and contestants is the same
Ha = The proportion of companies that are operating between winners and contestants is not the same
# Using SQL to manipulate dataframe df_3
# Calculate the number of Winners and Contestants Startups still operating
win_operating = psql.sqldf("select Funds_in_million from df_3 where OperatingState = 'Operating' and Result = 'Winner'")
con_operating = psql.sqldf("select Funds_in_million from df_3 where OperatingState = 'Operating' and Result = 'Contestant'")
# import z test to find z_stat, p_val
from statsmodels.stats.weightstats import ztest
alpha = 0.05 # considering 95% Confidence Interval
z_stat1, p_val1 = ztest(win_operating, con_operating)
z_stat1, p_val1
(array([0.08151046]), array([0.93503601]))
# check if calculated p_val is greater/ less than alpha
if p_val1 > alpha:
print("Since p_val of {} is greater than alpha value of {}, we Fail to reject H0".format(round(0.93503601,2), alpha))
else:
print("Since p_val of {} is less than alpha value of {}, we Reject H0".format(round(0.93503601,2), alpha))
Since p_val of 0.94 is greater than alpha value of 0.05, we Fail to reject H0
# find the z critical at alpha = 0.05
z_crit = norm.isf(0.05/2)
z_crit
1.9599639845400545
# check if calculated z_stat is greater/ less than z_crit
if z_crit > z_stat:
print("Since z_crit of {} is greater than z_stat of {}, we Fail to reject H0".format(round(z_crit,2), round(0.08151046,2)))
else:
print("Since z_crit of {} is less than z_stat of {}, we Reject H0".format(round(z_crit,2), round(0.08151046,2)))
Since z_crit of 1.96 is greater than z_stat of 0.08, we Fail to reject H0
# Add a new field to dataframe df_3 to check the Year values present in Event names
df_3['Event_Year'] = df_3.Event.apply(lambda x : x[-4:])
# Select only the Event that has ‘disrupt’ keyword from 2013 onwards.
df_5 = psql.sqldf("select * from df_3 where lower(Event) like 'disrupt%' and Event_Year >= 2013")
df_5.head()
| Startup | Product | Funding | Event | Result | OperatingState | Funds_in_million | Event_Year | |
|---|---|---|---|---|---|---|---|---|
| 0 | 3DLT | 3dlt.com | $630K | Disrupt NYC 2013 | Contestant | Closed | 0.63 | 2013 |
| 1 | 3Dprintler | 3dprintler.com | $1M | Disrupt NY 2016 | Audience choice | Operating | 1.00 | 2016 |
| 2 | Agrilyst | agrilyst.com | $1M | Disrupt SF 2015 | Winner | Operating | 1.00 | 2015 |
| 3 | Aiden | aiden.ai | $750K | Disrupt London 2016 | Contestant | Operating | 0.75 | 2016 |
| 4 | Aircall | aircall.io | $11.6M | Disrupt SF 2015 | Contestant | Operating | 11.60 | 2015 |
# Check the rowsxcolumns in dataframe df_5
df_5.shape
(193, 8)